In this vignette, we demonstrate the capability of SCEG-HiC to infer enhancer–gene links by applying it to paired scATAC-seq/RNA-seq data. Specifically, we use a publicly available SHARE-seq dataset for mouse skin and process the data using the aggregation approach.

The implementation of SCEG-HiC is seamlessly compatible with the standard workflow of the Seurat/Signac packages. The SCEG-HiC pipeline consists of the following three main steps:

View data download code

You can download the required data from GSE140203 by running the following commands in a shell:

wget https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM4156nnn/GSM4156597/suppl/GSM4156597%5Fskin.late.anagen.peaks.bed.gz
wget https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM4156nnn/GSM4156597/suppl/GSM4156597%5Fskin.late.anagen.barcodes.txt.gz
wget https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM4156nnn/GSM4156597/suppl/GSM4156597%5Fskin.late.anagen.counts.txt.gz
wget https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM4156nnn/GSM4156597/suppl/GSM4156597%5Fskin%5Fcelltype.txt.gz
wget https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM4156nnn/GSM4156608/suppl/GSM4156608%5Fskin.late.anagen.rna.counts.txt.gz
wget https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM4156nnn/GSM4156597/suppl/GSM4156597%5Fskin.late.anagen.atac.fragments.bed.gz

The SHARE-seq dataset requires preprocessing to obtain the fragment file needed for downstream analysis. You can generate this file by running the following commands in a shell:

# Decompress the original gzipped fragment file
gunzip GSM4156597_skin.late.anagen.atac.fragments.bed.gz

# Replace all commas with dots to fix formatting issues
sed -i "s/,/./g" GSM4156597_skin.late.anagen.atac.fragments.bed

# Sort the BED file by chromosome (version sort), start, and end; count unique occurrences; rearrange columns to place counts last; compress output with bgzip
sort -k1,1V -k2,2n -k3,3n GSM4156597_skin.late.anagen.atac.fragments.bed | uniq -c | awk '{print $2,$3,$4,$5,$1}' OFS="\t" | bgzip -c > GSM4156597_skin.late.anagen.atac.fragments.mybed.gz

# Index the compressed BED file using tabix for fast access during downstream analysis
# Before indexing, make sure tabix is installed. You can install it via Conda: conda install bioconda::tabix
tabix -0 -p bed GSM4156597_skin.late.anagen.atac.fragments.mybed.gz

Load the required libraries

library(SCEGHiC)
library(Seurat)
library(Signac)
library(EnsDb.Mmusculus.v79) # Annotation for mouse skin
library(dplyr)
library(Matrix)
library(GenomicRanges)
library(ggplot2)

Set up the Seurat Object

To facilitate easy exploration, mouse_skin_multiomic.rds file is also available at 10.5281/zenodo.14849886.

Note: Due to the stochastic nature of SCTransform, particularly in generating scale.data, the UMAP embedding produced by re-running the code may differ slightly from that in the provided mouse_skin_multiomic.rds file.

Load and preprocess SHARE-seq skin multi-omics data

Load RNA and ATAC count matrices from the SHARE-seq skin multi-omic dataset, align barcodes with original cell type annotations, and convert ATAC peak coordinates into genomic ranges for downstream integrative analysis.

# Load the required RNA and ATAC data
con <- gzfile("GSM4156597_skin_celltype.txt.gz", "rt")
ct <- read.table(con, header = TRUE, sep = "\t")
close(con)
# Columns in 'ct':
#  - First: ATAC barcode
#  - Second: RNA barcode
#  - Third: Annotated cell type

# Load the RNA count matrix (genes × cells)
rna.count <- read.table("GSM4156608_skin.late.anagen.rna.counts.txt.gz", header = T, row.names = 1)

# Match and reorder RNA counts to cell type annotation
any(is.na(match(ct$rna.bc, colnames(rna.count)))) # Check for missing matches
## [1] FALSE
rna.count <- rna.count[, match(ct$rna.bc, colnames(rna.count))]

# Rename RNA count matrix columns to ATAC barcodes for integration
colnames(rna.count) <- ct$atac.bc

# Read ATAC barcodes (should match RNA barcodes)
peak.barcodes <- scan("GSM4156597_skin.late.anagen.barcodes.txt.gz", what = "")
all(ct$rna.bc == peak.barcodes)
## [1] TRUE
# Read ATAC peak matrix in Matrix Market format
peak.count <- readMM("GSM4156597_skin.late.anagen.counts.txt.gz")
dim(peak.count) # Peak × cell dimensions
## [1] 344592  34774
# Load peak coordinates and convert to GRanges object
peak.bed <- read.delim("GSM4156597_skin.late.anagen.peaks.bed.gz", header = FALSE)
peak.granges <- GRanges(seqnames = peak.bed$V1, ranges = IRanges(st = peak.bed$V2, end = peak.bed$V3))

# Assign row and column names to the peak count matrix
rownames(peak.count) <- paste(peak.granges)
colnames(peak.count) <- ct$atac.bc

Generation of a multi-omics Seurat object with RNA and ATAC data

Construct a Seurat object combining RNA expression and ATAC-seq chromatin accessibility data, annotated with cell types based on the original study and genome features from mm10.

# Create a Seurat object containing the RNA count matrix (assay named "RNA")
skin <- CreateSeuratObject(counts = rna.count, assay = "RNA")

# Filter peaks to keep only those on standard chromosomes,
# removing any peaks from non-standard or unplaced contigs
grange.use <- seqnames(peak.granges) %in% standardChromosomes(peak.granges)
peak.count <- peak.count[as.vector(grange.use), ]
peak.granges <- peak.granges[as.vector(grange.use)]

# Get gene annotations from EnsDb.Mmusculus.v79
# Add "chr" prefix to chromosome names to match peak naming conventions
annotations <- GetGRangesFromEnsDb(ensdb = EnsDb.Mmusculus.v79)
seqlevels(annotations) <- paste0("chr", seqlevels(annotations))
genome(annotations) <- "mm10"

# Filter out peaks detected in fewer than 10 cells
peak.count <- peak.count[rowSums(peak.count) > 10, ]

# Specify the path to the ATAC-seq fragments file
fragpath <- "GSM4156597_skin.late.anagen.atac.fragments.mybed.gz"

# Create an ATAC assay using the filtered peak counts and annotations, and add it to the Seurat object
skin[["ATAC"]] <- CreateChromatinAssay(
  counts = peak.count,
  sep = c(":", "-"),
  fragments = fragpath,
  genome = "mm10",
  annotation = annotations
)

# Annotate cell types for each cell in the Seurat object based on the 'ct' table, matching cell names from the Seurat object with those in 'ct'
skin$celltype <- ct[match(colnames(skin), ct[, 1]), 3]
# Print summary of the filtered Seurat object
skin
## An object of class Seurat 
## 367087 features across 34774 samples within 2 assays 
## Active assay: RNA (23296 features, 0 variable features)
##  1 layer present: counts
##  1 other assay present: ATAC

Quality control and filtering

Calculate QC metrics (RNA counts, ATAC counts, mitochondrial percentage, nucleosome signal, TSS enrichment) and filter low-quality cells using defined thresholds.

# Calculate the percentage of mitochondrial gene counts for each cell
# Mitochondrial genes typically start with "MT-"
skin[["percent.mt"]] <- PercentageFeatureSet(skin, pattern = "^MT-")

# Set default assay to ATAC for ATAC-seq quality metrics calculation
DefaultAssay(skin) <- "ATAC"

# Calculate nucleosome signal score, which reflects nucleosome positioning
skin <- NucleosomeSignal(skin)

# Calculate Transcription Start Site (TSS) enrichment score for each cell, a metric for ATAC-seq data quality
skin <- TSSEnrichment(skin)

# Plot violin plots for QC metrics:
# - RNA counts per cell (nCount_RNA)
# - ATAC counts per cell (nCount_ATAC)
# - TSS enrichment score (TSS.enrichment)
# - Nucleosome signal score (nucleosome_signal)
VlnPlot(
  object = skin,
  features = c("nCount_RNA", "nCount_ATAC", "TSS.enrichment", "nucleosome_signal"),
  ncol = 4,
  pt.size = 0
)

# Filter out low-quality cells based on multiple QC thresholds
skin <- subset(
  x = skin,
  subset = nCount_ATAC < 20000 &
    nCount_RNA < 5000 &
    nCount_ATAC > 500 &
    nCount_RNA > 500 &
    nucleosome_signal < 1 &
    TSS.enrichment > 1
)

# Print summary of the filtered Seurat object
skin
## An object of class Seurat 
## 367087 features across 28821 samples within 2 assays 
## Active assay: ATAC (343791 features, 0 variable features)
##  2 layers present: counts, data
##  1 other assay present: RNA

Peak calling and quantification

Call peaks, filter, count fragments per peak, and add to Seurat object.

# Call peaks using MACS2 from the fragments in the Seurat object
peaks <- CallPeaks(skin)

# Remove peaks on nonstandard chromosomes and in genomic blacklist regions
peaks <- keepStandardChromosomes(peaks, pruning.mode = "coarse")
peaks <- subsetByOverlaps(x = peaks, ranges = blacklist_mm10, invert = TRUE)

# Quantify counts of fragments overlapping each peak for each cell, creating a peak-by-cell count matrix
macs2_counts <- FeatureMatrix(
  fragments = Fragments(skin),
  features = peaks,
  cells = colnames(skin)
)
## Extracting reads overlapping genomic regions
# Create a new chromatin assay based on MACS2-called peaks and add it to the Seurat object
skin[["peaks"]] <- CreateChromatinAssay(
  counts = macs2_counts,
  fragments = fragpath,
  annotation = annotations
)
## Computing hash
# Print summary of the updated Seurat object
skin
## An object of class Seurat 
## 562259 features across 28821 samples within 3 assays 
## Active assay: ATAC (343791 features, 0 variable features)
##  2 layers present: counts, data
##  2 other assays present: RNA, peaks

RNA analysis

Normalize RNA data with SCTransform and perform PCA for dimensionality reduction.

# Set the default assay to "RNA" for RNA-based analysis
DefaultAssay(skin) <- "RNA"

# Perform normalization and variance stabilization using SCTransform
# This replaces traditional log-normalization and identifies variable features
skin <- SCTransform(skin)

# Run Principal Component Analysis (PCA) on the normalized data
# PCA reduces dimensionality while preserving major sources of variation
skin <- RunPCA(skin)

ATAC analysis

Identify highly accessible peaks and perform dimensionality reduction using TF-IDF and SVD (LSI).

# Set the default assay to "peaks" (i.e., the MACS2-called peak matrix)
DefaultAssay(skin) <- "peaks"

# Identify top features (peaks) based on accessibility; only keep peaks present in at least 5 cells
skin <- FindTopFeatures(skin, min.cutoff = 5)

# Perform Term Frequency-Inverse Document Frequency (TF-IDF) normalization
skin <- RunTFIDF(skin)

# Perform dimensionality reduction using Singular Value Decomposition (SVD), also referred to as Latent Semantic Indexing (LSI) in this context
skin <- RunSVD(skin)

Intergration analysis

Using Seurat v4’s weighted nearest neighbor method, we integrate gene expression and chromatin accessibility into a joint neighbor graph and create a unified UMAP embedding combining RNA and ATAC data.

# Build a multimodal nearest-neighbor graph using PCA from RNA and LSI from ATAC
skin <- FindMultiModalNeighbors(
  object = skin,
  reduction.list = list("pca", "lsi"),
  dims.list = list(1:50, 2:40),
  modality.weight.name = "RNA.weight",
  verbose = TRUE
)

# Generate a joint UMAP embedding using the WNN graph
# The embedding combines information from both RNA and ATAC modalities
skin <- RunUMAP(
  object = skin,
  nn.name = "weighted.nn",
  assay = "RNA",
  verbose = TRUE
)

# Visualize the UMAP embedding, grouping cells by annotated cell type
DimPlot(skin, reduction = "umap", group.by = "celltype", label = TRUE, label.size = 2.5, repel = TRUE) + NoLegend()